InΒ [1]:
"""
Professional Network Analysis: Keyword Co-occurrence Study
=========================================================
Author: Rosalina Torres
Title: "The Science Behind the Art"
This module provides comprehensive network analysis tools for examining keyword
co-occurrence patterns in large datasets. It generates multiple visualization
perspectives to reveal community structures, importance rankings, and connection patterns.
Dependencies:
- pandas, networkx, matplotlib, numpy, seaborn
- Optional: python-louvain (for community detection)
"""
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
import warnings
import time
# Configuration
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
class NetworkAnalyzer:
    """
    A comprehensive network analysis toolkit for keyword co-occurrence data.

    This class handles data loading, network construction, and generates
    multiple analytical visualizations to understand keyword relationships
    and importance.

    Attributes:
        data_path: Optional user-supplied path to the co-occurrence CSV.
        co_occurrence_matrix: Square pandas DataFrame of co-occurrence counts.
        network: Weighted undirected networkx Graph built from the matrix.
    """

    def __init__(self, data_path=None):
        """
        Initialize the NetworkAnalyzer.

        Args:
            data_path (str, optional): Path to co-occurrence matrix CSV file.
        """
        self.data_path = data_path
        self.co_occurrence_matrix = None
        self.network = None
        self.logger = None  # placeholder; kept for interface compatibility

    def find_data_file(self):
        """
        Locate the co-occurrence matrix file from common paths.

        Returns:
            str or None: Path to the data file if found, None otherwise.
        """
        # An explicitly supplied path wins when it exists.
        if self.data_path and os.path.exists(self.data_path):
            return self.data_path
        common_paths = [
            "co_occurrence_matrix.csv",
            os.path.expanduser("~/Downloads/co_occurrence_matrix.csv"),
            os.path.expanduser("~/Desktop/co_occurrence_matrix.csv"),
        ]
        for path in common_paths:
            if os.path.exists(path):
                print(f"Found data file: {path}")
                return path
        print("Warning: no data file found in common locations")
        return None

    def load_data(self):
        """
        Load and clean the co-occurrence matrix data.

        Returns:
            bool: True if data loaded successfully, False otherwise.
        """
        file_path = self.find_data_file()
        if not file_path:
            print("Cannot proceed without data file")
            return False
        try:
            self.co_occurrence_matrix = pd.read_csv(file_path, index_col=0)
            print(f"Loaded dataset: {self.co_occurrence_matrix.shape}")
            # Standardize keyword labels on both axes.
            self.co_occurrence_matrix.columns = (
                self.co_occurrence_matrix.columns.map(self._clean_keyword)
            )
            self.co_occurrence_matrix.index = (
                self.co_occurrence_matrix.index.map(self._clean_keyword)
            )
            # Drop rows/columns whose keyword cleaned down to the empty string.
            self.co_occurrence_matrix = self.co_occurrence_matrix.loc[
                self.co_occurrence_matrix.index != '',
                self.co_occurrence_matrix.columns != ''
            ]
            print(f"Cleaned data: {self.co_occurrence_matrix.shape}")
            return True
        except Exception as e:
            print(f"Error loading data: {e}")
            return False

    def _clean_keyword(self, keyword):
        """Clean and standardize a keyword string: treat '--' as a word
        separator, collapse whitespace, lowercase. NaN/None becomes ''."""
        if pd.isna(keyword):
            return ""
        cleaned = " ".join(str(keyword).replace("--", " ").split())
        return cleaned.strip().lower()

    def build_network(self, min_weight=1):
        """
        Build a weighted undirected network from the co-occurrence matrix.

        Args:
            min_weight (int): Minimum edge weight to include in network.

        Returns:
            bool: True if network built successfully, False otherwise.
        """
        if self.co_occurrence_matrix is None:
            print("No data loaded. Call load_data() first.")
            return False
        print(f"Building network (min_weight={min_weight})...")
        self.network = nx.Graph()
        # Add edges whose co-occurrence count passes the threshold.
        for word1 in self.co_occurrence_matrix.index:
            # Fetch the row once instead of an .at lookup per cell.
            row = self.co_occurrence_matrix.loc[word1]
            for word2 in self.co_occurrence_matrix.columns:
                if word1 != word2:
                    weight = row[word2]
                    if pd.notna(weight) and weight >= min_weight:
                        self.network.add_edge(word1, word2, weight=weight)
        # Isolated nodes carry no co-occurrence information; drop them.
        self.network.remove_nodes_from(list(nx.isolates(self.network)))
        print(f"Network built: {self.network.number_of_nodes()} nodes, "
              f"{self.network.number_of_edges()} edges")
        return True

    def analyze_network_properties(self):
        """
        Calculate and display key network properties.

        Returns:
            dict: Network metrics ('nodes', 'edges', 'density', 'avg_degree',
                'components', and 'top_keywords' when centrality succeeds).
                Empty dict when no network has been built.
        """
        if self.network is None:
            print("No network available. Build network first.")
            return {}
        print("\nNETWORK ANALYSIS RESULTS")
        print("=" * 50)
        node_count = self.network.number_of_nodes()
        metrics = {
            'nodes': node_count,
            'edges': self.network.number_of_edges(),
            'density': nx.density(self.network),
            # Guard against division by zero on an empty network.
            'avg_degree': (sum(dict(self.network.degree()).values()) / node_count
                           if node_count else 0.0),
            'components': nx.number_connected_components(self.network),
        }
        print(f"Total Keywords (Nodes): {metrics['nodes']}")
        print(f"Total Connections (Edges): {metrics['edges']}")
        print(f"Network Density: {metrics['density']:.4f}")
        print(f"Average Degree: {metrics['avg_degree']:.2f}")
        print(f"Connected Components: {metrics['components']}")
        # Calculate centrality-based importance rankings.
        try:
            print("\nCalculating importance metrics...")
            pagerank = nx.pagerank(self.network, weight='weight', max_iter=100)
            top_keywords = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:10]
            print("\nTOP 10 MOST IMPORTANT KEYWORDS:")
            for i, (keyword, score) in enumerate(top_keywords, 1):
                print(f"{i:2d}. {keyword.title():<25} (Score: {score:.4f})")
            metrics['top_keywords'] = top_keywords
        except Exception as e:
            print(f"Warning: error calculating centrality: {e}")
        return metrics

    def create_community_overview(self, figsize=(20, 16)):
        """Create community structure visualization.

        Uses python-louvain community detection when installed; otherwise
        falls back to degree-centrality coloring.

        Args:
            figsize (tuple): Matplotlib figure size.
        """
        if self.network is None:
            return
        print("Creating Community Structure Overview...")
        degree_centrality = nx.degree_centrality(self.network)
        try:
            import community as community_louvain
            communities = community_louvain.best_partition(self.network, weight='weight')
            node_colors = [communities[node] for node in self.network.nodes()]
            cmap = plt.cm.tab20
            title_extra = f" ({len(set(communities.values()))} Communities)"
        except ImportError:
            # Optional dependency missing: color by degree centrality instead.
            node_colors = [degree_centrality[node] for node in self.network.nodes()]
            cmap = plt.cm.viridis
            title_extra = " (Degree-based Coloring)"
        pos = nx.spring_layout(self.network, k=3, iterations=30, weight='weight')
        node_sizes = [50 + degree_centrality[node] * 500 for node in self.network.nodes()]
        plt.figure(figsize=figsize, facecolor='white')
        nx.draw(self.network, pos, node_size=node_sizes, node_color=node_colors,
                cmap=cmap, alpha=0.8, linewidths=1, edgecolors='white',
                edge_color='gray', width=0.3)
        plt.title(f'Keyword Network Community Structure{title_extra}\n'
                  f'{self.network.number_of_nodes()} Keywords, '
                  f'{self.network.number_of_edges()} Connections',
                  fontsize=20, fontweight='bold', pad=20)
        # Overlay summary statistics in the corner of the plot.
        node_count = self.network.number_of_nodes()
        avg_degree = (sum(dict(self.network.degree()).values()) / node_count
                      if node_count else 0.0)
        stats_text = (
            "Network Statistics:\n"
            f"Density: {nx.density(self.network):.4f}\n"
            f"Avg Degree: {avg_degree:.1f}\n"
            f"Components: {nx.number_connected_components(self.network)}"
        )
        plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes,
                 fontsize=12, verticalalignment='top',
                 bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.8))
        plt.axis('off')
        plt.tight_layout()
        plt.show()

    def create_importance_analysis(self, figsize=(20, 16)):
        """Create PageRank importance visualization (node size and color
        both encode the importance score).

        Args:
            figsize (tuple): Matplotlib figure size.
        """
        if self.network is None:
            return
        print("Creating Importance Analysis...")
        try:
            pagerank = nx.pagerank(self.network, weight='weight', max_iter=100)
        except Exception:
            # Fall back to a uniform score if PageRank fails (e.g. no
            # convergence) so the plot can still be drawn.
            pagerank = {node: 1 / self.network.number_of_nodes()
                        for node in self.network.nodes()}
        pos = nx.spring_layout(self.network, k=3, iterations=30, weight='weight')
        node_sizes = [50 + pagerank[node] * 2000 for node in self.network.nodes()]
        node_colors = [pagerank[node] for node in self.network.nodes()]
        plt.figure(figsize=figsize, facecolor='white')
        # Draw edges first so nodes render on top.
        nx.draw_networkx_edges(self.network, pos, edge_color='lightgray',
                               width=0.3, alpha=0.5)
        nodes = nx.draw_networkx_nodes(self.network, pos, node_size=node_sizes,
                                       node_color=node_colors, cmap=plt.cm.plasma,
                                       alpha=0.8, linewidths=1, edgecolors='white')
        plt.colorbar(nodes, label='Importance Score', shrink=0.8)
        plt.title('Keyword Importance Analysis\nNode Size & Color = Importance Score',
                  fontsize=20, fontweight='bold', pad=20)
        # Annotate the figure with the top-ranked keywords.
        top_keywords = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:10]
        top_text = "Top 10 Keywords:\n" + "\n".join(
            f"{i + 1}. {node.title()}" for i, (node, _) in enumerate(top_keywords)
        )
        plt.text(0.02, 0.02, top_text, transform=plt.gca().transAxes,
                 fontsize=10, verticalalignment='bottom',
                 bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.9))
        plt.axis('off')
        plt.tight_layout()
        plt.show()

    def create_hub_analysis(self, figsize=(20, 16), top_n=25):
        """Create hub keywords visualization with labels.

        Args:
            figsize (tuple): Matplotlib figure size.
            top_n (int): Number of top-ranked keywords to include.
        """
        if self.network is None:
            return
        print(f"Creating Hub Analysis (Top {top_n} Keywords)...")
        try:
            pagerank = nx.pagerank(self.network, weight='weight', max_iter=100)
            degree_centrality = nx.degree_centrality(self.network)
        except Exception:
            # Uniform fallback keeps the plot drawable if centrality fails.
            pagerank = {node: 1 for node in self.network.nodes()}
            degree_centrality = {node: 1 for node in self.network.nodes()}
        # Restrict to the induced subgraph of the top-N important nodes.
        top_nodes = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:top_n]
        hub_network = self.network.subgraph([node for node, _ in top_nodes]).copy()
        if len(hub_network.nodes()) == 0:
            print("No hub nodes found")
            return
        pos = nx.spring_layout(hub_network, k=5, iterations=50, weight='weight')
        node_sizes = [200 + pagerank[node] * 1500 for node in hub_network.nodes()]
        node_colors = [degree_centrality[node] for node in hub_network.nodes()]
        plt.figure(figsize=figsize, facecolor='white')
        nx.draw(hub_network, pos, node_size=node_sizes, node_color=node_colors,
                cmap=plt.cm.coolwarm, alpha=0.9, linewidths=2, edgecolors='black',
                edge_color='darkblue', width=2, with_labels=True,
                font_size=11, font_weight='bold', font_color='darkblue')
        plt.title(f'Top {top_n} Hub Keywords Network\nWith Connection Patterns',
                  fontsize=20, fontweight='bold', pad=20)
        # Side panel with the numeric rankings (cap at 15 entries).
        ranking_text = "Importance Rankings:\n" + "\n".join(
            f"{i + 1:2d}. {node.title()}: {score:.4f}"
            for i, (node, score) in enumerate(top_nodes[:15])
        )
        plt.text(0.02, 0.98, ranking_text, transform=plt.gca().transAxes,
                 fontsize=10, verticalalignment='top',
                 bbox=dict(boxstyle='round', facecolor='lightcyan', alpha=0.9))
        plt.axis('off')
        plt.tight_layout()
        plt.show()

    def create_strong_connections(self, figsize=(20, 16), percentile=75):
        """Create strongest-connections visualization.

        Args:
            figsize (tuple): Matplotlib figure size.
            percentile (int): Weight percentile; edges at or above the
                corresponding threshold are kept.
        """
        if self.network is None:
            return
        print(f"Creating Strongest Connections (Top {100 - percentile}%)...")
        edges = list(self.network.edges(data=True))
        if len(edges) == 0:
            print("No edges found")
            return
        weights = [d['weight'] for u, v, d in edges]
        threshold = np.percentile(weights, percentile)
        strong_edges = [(u, v) for u, v, d in edges if d['weight'] >= threshold]
        strong_network = self.network.edge_subgraph(strong_edges).copy()
        if len(strong_network.nodes()) == 0:
            print("No strong connections found")
            return
        print(f" Showing {len(strong_network.edges())} strongest connections "
              f"out of {len(edges)} total")
        pos = nx.spring_layout(strong_network, k=4, iterations=50, weight='weight')
        plt.figure(figsize=figsize, facecolor='white')
        # Scale edge widths to [1, 9] across the kept weight range.
        edge_weights = [strong_network[u][v]['weight'] for u, v in strong_network.edges()]
        max_weight, min_weight = max(edge_weights), min(edge_weights)
        weight_span = max_weight - min_weight
        if weight_span > 0:
            edge_widths = [(w - min_weight) / weight_span * 8 + 1 for w in edge_weights]
        else:
            # All kept edges share one weight; avoid division by zero.
            edge_widths = [5.0] * len(edge_weights)
        node_degrees = dict(strong_network.degree())
        node_sizes = [100 + node_degrees[node] * 50 for node in strong_network.nodes()]
        nx.draw_networkx_edges(strong_network, pos, width=edge_widths,
                               edge_color='red', alpha=0.7)
        nx.draw_networkx_nodes(strong_network, pos, node_size=node_sizes,
                               node_color='lightblue', alpha=0.9,
                               linewidths=2, edgecolors='navy')
        # Label only the better-connected nodes to keep the plot readable;
        # compute the degree cutoff once, outside the comprehension.
        degree_cutoff = np.percentile(list(node_degrees.values()), 70)
        high_degree_nodes = {
            node: node for node in strong_network.nodes()
            if node_degrees[node] >= degree_cutoff
        }
        if high_degree_nodes:
            nx.draw_networkx_labels(strong_network, pos, labels=high_degree_nodes,
                                    font_size=10, font_weight='bold',
                                    font_color='darkblue')
        plt.title(f'Strongest Keyword Connections\nTop {100 - percentile}% of '
                  f'Connections (Threshold: {threshold:.1f})',
                  fontsize=20, fontweight='bold', pad=20)
        stats_text = (
            "Connection Statistics:\n"
            f"Strongest: {max_weight:.1f}\n"
            f"Weakest shown: {min_weight:.1f}\n"
            f"Average: {np.mean(edge_weights):.1f}\n"
            f"Total connections: {len(strong_network.edges())}"
        )
        plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes,
                 fontsize=12, verticalalignment='top',
                 bbox=dict(boxstyle='round', facecolor='lightpink', alpha=0.8))
        plt.axis('off')
        plt.tight_layout()
        plt.show()

    def run_complete_analysis(self):
        """
        Execute the complete network analysis pipeline.

        Runs data loading, network construction, property analysis, and all
        four visualization perspectives.

        Returns:
            bool: True if the full pipeline completed, False otherwise.
        """
        print("COMPREHENSIVE KEYWORD NETWORK ANALYSIS")
        print("=" * 60)
        # Load and prepare data.
        if not self.load_data():
            return False
        # Build network.
        if not self.build_network():
            return False
        # Analyze network properties.
        metrics = self.analyze_network_properties()
        # Generate the four visualization perspectives.
        print("\nCreating comprehensive visualizations...")
        print("=" * 50)
        print("\n1. COMMUNITY STRUCTURE OVERVIEW")
        self.create_community_overview()
        print("\n2. KEYWORD IMPORTANCE ANALYSIS")
        self.create_importance_analysis()
        print("\n3. HUB KEYWORDS ANALYSIS")
        self.create_hub_analysis()
        print("\n4. STRONGEST CONNECTIONS NETWORK")
        self.create_strong_connections()
        print("\nANALYSIS COMPLETE!")
        print("Generated 4 comprehensive network visualizations")
        print(f"Network contains {metrics.get('nodes', 0)} keywords "
              f"with {metrics.get('edges', 0)} connections")
        return True
def main():
    """
    Main execution function for keyword network analysis.

    Builds a NetworkAnalyzer with auto-discovered data and runs the full
    pipeline, printing a summary on success or a hint on failure.

    Usage:
        analyzer = NetworkAnalyzer('path/to/co_occurrence_matrix.csv')
        analyzer.run_complete_analysis()
    """
    # Initialize analyzer (no explicit path: search common locations).
    analyzer = NetworkAnalyzer()
    # Run complete analysis.
    success = analyzer.run_complete_analysis()
    if success:
        print("\nANALYSIS SUMMARY")
        print("=" * 30)
        print("Data successfully loaded and analyzed")
        print("Network structure revealed")
        print("Key insights identified")
        print("Visualizations generated")
        print("\nRefer to the generated plots for detailed insights into")
        print("keyword relationships, community structures, and importance rankings.")
    else:
        print("\nAnalysis failed. Please check your data file and try again.")


if __name__ == "__main__":
    main()
π COMPREHENSIVE KEYWORD NETWORK ANALYSIS ============================================================ β Found data file: co_occurrence_matrix.csv π Loaded dataset: (276, 276) π§Ή Cleaned data: (276, 276) π¨ Building network (min_weight=1)... β Network built: 276 nodes, 5115 edges π NETWORK ANALYSIS RESULTS ================================================== Total Keywords (Nodes): 276 Total Connections (Edges): 5115 Network Density: 0.1348 Average Degree: 37.07 Connected Components: 1 π Calculating importance metrics... π TOP 10 MOST IMPORTANT KEYWORDS: 1. Management (Score: 0.0646) 2. Organizational (Score: 0.0569) 3. Behavior (Score: 0.0229) 4. Business (Score: 0.0207) 5. Industrial (Score: 0.0195) 6. Relations (Score: 0.0153) 7. Psychology (Score: 0.0146) 8. Decision (Score: 0.0143) 9. Making (Score: 0.0135) 10. Personnel (Score: 0.0135) π¨ Creating comprehensive visualizations... ================================================== 1οΈβ£ COMMUNITY STRUCTURE OVERVIEW π¨ Creating Community Structure Overview...
2οΈβ£ KEYWORD IMPORTANCE ANALYSIS π¨ Creating Importance Analysis...
3οΈβ£ HUB KEYWORDS ANALYSIS π¨ Creating Hub Analysis (Top 25 Keywords)...
4οΈβ£ STRONGEST CONNECTIONS NETWORK π¨ Creating Strongest Connections (Top 25%)... Showing 1952 strongest connections out of 5115 total
β ANALYSIS COMPLETE! Generated 4 comprehensive network visualizations Network contains 276 keywords with 5115 connections π ANALYSIS SUMMARY ============================== β Data successfully loaded and analyzed β Network structure revealed β Key insights identified β Visualizations generated Refer to the generated plots for detailed insights into keyword relationships, community structures, and importance rankings.
InΒ [Β ]: